DATA PRE-TREATMENT
In this section, we import and treat the data, as well as select relevant variables.
Generation of data
To be able to run this analysis, download the YRBS data, convert it into CSV, save both files to data/yrbs, generate the OP YRBS dataset and save it to data/op_export/1_1_prep_yrbs_data.csv.
We downloaded the latest YRBS data in the MDB format from the CDC on 22 October 2019. We then converted it into a CSV file using Microsoft Access 2019. The files have the following MD5 checksums:
# MD5 checksums of the raw YRBS inputs (original MDB + its CSV conversion),
# so readers can verify they start from identical source files.
files <- file.path(
  PATH,
  c('data/yrbs/yrbs_SADC_2017_National.MDB',
    'data/yrbs/yrbs_SADC_2017_National.csv')
)
md5s <- setNames(md5sum(files), basename(files))
md5s
## yrbs_SADC_2017_National.MDB yrbs_SADC_2017_National.csv
## "55d6d502955a2cda631f91930648442a" "e2ceff2b7a093b64bea5c94a42d17723"
We then ran the script 1_1_prep_yrbs.R from OP’s replication files, after adjusting the input and output file paths. The resulting dataset has the following MD5 checksum:
# MD5 checksum of the dataset produced by OP's 1_1_prep_yrbs.R script.
yrbs_path <- file.path(PATH, 'data/op_export/1_1_prep_yrbs_data.csv')
md5s <- setNames(md5sum(yrbs_path), basename(yrbs_path))
md5s
## 1_1_prep_yrbs_data.csv
## "0ac7e2fe5e2e0f31a037f9392d84350e"
Import data
Import the data, as generated by OP script. Remove constant variables.
# Load the prepared YRBS data and drop the site identifiers, which are
# constant in the national file and thus uninformative.
data <- read.csv(yrbs_path, header = TRUE)
site_cols <- c('sitecode', 'sitename', 'sitetype', 'sitetypenum')
data <- data[, setdiff(colnames(data), site_cols)]
Treatment variables
# Treatments as coded by OP: TV hours, electronic-device hours, and their mean.
x_vars_op <- c("q81_n", "q82_n", "tech")
x_names_op <- c("TV Use", "Electronic Device Use", "Mean Technology Use")
x <- setNames(data[, x_vars_op], x_names_op)
Outcome variables
Turn the outcome variables into what they say: 0=no, 1=yes.
# Outcomes as coded by OP; flip the 0/1 coding so that 0 = no and 1 = yes.
y_vars_op <- c("q26_n", "q27_n", "q28_n", "q29_nd", "q30_nd")
y_names <- c("loneliness", "think suicide", "plan suicide", "commit suicide", "doctor suicide")
data[y_vars_op] <- lapply(data[y_vars_op], function(v) 1 - v)
y <- setNames(data[, y_vars_op], y_names)
Control variables
Indicate controls used in YRBS study (adolescent’s race) and additional controls. Discretize BMI according to the standard definition.
# Control used in the original YRBS analysis: the dichotomous race indicator.
cvars= c("race_di")
c_names= c("dichotomous race")
# cut() uses right-closed intervals by default, so a BMI of exactly 18.5
# lands in 'underw' and exactly 25 in 'normal'.
# NOTE(review): the usual clinical convention is underweight < 18.5,
# normal [18.5, 25), etc. -- confirm the boundary direction is intended.
data$bmir= cut(data[,'bmi'], breaks=c(0,18.5,25,30,Inf)) #discretize body mass index
levels(data$bmir)= c('underw','normal','overw','obese')
# Additional controls beyond OP's: demographics, survey year and BMI category.
cvarsplus= c("age_n", "sex_n", "grade_n", "year", "bmir") #survyear same as year
Re-coding variables
Based on the below analyses we make the following adjustments to the variables, as coded and used by OP.
TV usage has a non-monotonic (U-shaped) treatment effect on all outcomes. Discretize the variable into low, medium and high usage, using cut-offs revealed by the linearity analysis.
# Discretize TV use (coded 1-7): right-closed breaks put codes 1-2 in 'low',
# 3-6 in 'medium' and 7 in 'high', per the linearity analysis cut-offs.
data$q81_nr= cut(data[,'q81_n'], breaks=c(0,2,6,Inf)) #discretize tv use
levels(data$q81_nr)= c('low', 'medium', 'high')
Normalize the ED usage values to [0, 1], which makes the coefficient more easily interpretable (difference between min and max usage).
# Map ED use codes 1..7 linearly onto [0, 1]: (code - 1) / 6.
data$q82_nr = (data$q82_n - 1)/6
Mean technology use is a linear combination of TV and ED use. Leave it out of the regression.
# Final treatment variables: discretized TV use and normalized ED use.
# Mean technology use is excluded, being a linear combination of the two.
x_vars= c("q81_nr", "q82_nr")
x_names= c("TV Use", "Electronic Device Use")
Age 12 and 13 have very low number of observations (are very young for grades 9-12). They also have different effects, than age in general (which appears roughly linear). Include them in the regression to control for differential effects at these ages.
# 0/1 dummies flagging the rare 12- and 13-year-old respondents
# (NA in age_n stays NA in the dummy).
data$age12 <- as.numeric(data$age_n == 12)
data$age13 <- as.numeric(data$age_n == 13)
Set commit and doctor to 1 (did not attempt suicide and did not see a doctor about it) if both think and plan are 1 (did not think or plan suicide) to solve a problem with missing values (see below).
# Re-coded outcome set: commit/doctor suicide use repaired copies of the
# original columns, leaving the originals intact for the NA diagnostics.
y_vars = y_vars_op = c("q26_n", "q27_n", "q28_n", "q29_ndr", "q30_ndr")
data$q29_ndr = data$q29_nd
data$q30_ndr = data$q30_nd
# Respondents who neither thought about (think==1) nor planned (plan==1) a
# suicide are set to 1 on commit/doctor. `%in%` maps NA comparisons to FALSE
# (equivalent to replace_na(..., FALSE)) and returns a plain logical vector;
# the previous code mixed data[y_vars[2]] (one-column data frame) with
# data[, y_vars[3]] (vector), which made `cond` a one-column matrix.
cond = data[, y_vars[2]] %in% 1 & data[, y_vars[3]] %in% 1
data[cond, y_vars[4:5]] = 1
y = data[, y_vars]; names(y) = y_names
EXPLORATORY DATA ANALYSIS
Format categorical variables as factors.
# Re-cast the categorical treatments and controls as factors, so the
# exploratory regressions estimate one coefficient per level.
# seq_along() replaces 1:length(nn), which would yield c(1, 0) on an
# empty vector and index out of bounds.
xf= x
dataf= data
nn= c('TV Use','Electronic Device Use')
for (i in seq_along(nn)) xf[,nn[i]]= factor(xf[,nn[i]])
nn= c('age_n','sex_n','grade_n','year')
for (i in seq_along(nn)) dataf[,nn[i]]= factor(dataf[,nn[i]])
Value counts
Outcomes.
apply(y,2,'table')
## loneliness think suicide plan suicide commit suicide doctor suicide
## 0 52206 62365 64178 56985 57104
## 1 22090 11932 9741 9032 7747
Treatments.
1 = No usage 2 = Less than 1 hour per day 3 = 1 hour per day 4 = 2 hours per day 5 = 3 hours per day 6 = 4 hours per day 7 = 5 or more hours per day
apply(xf,2,'table') #table for the treatments
## $`TV Use`
##
## 1 2 3 4 5 6 7
## 9223 12466 10615 15580 11566 5762 8118
##
## $`Electronic Device Use`
##
## 1 2 3 4 5 6 7
## 12669 13517 10638 11902 8924 5202 10519
##
## $`Mean Technology Use`
##
## 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0
## 2949 3882 5797 7540 9230 8868 9655 7658 5415 4488 3096 1522 2978
Controls.
apply(dataf[,c(cvars,cvarsplus)],2,'table') #table for the controls
## $race_di
##
## 0 1
## 42410 31133
##
## $age_n
##
## 12 13 14 15 16 17 18
## 163 84 7608 17325 19038 19032 11497
##
## $sex_n
##
## 0 1
## 37402 37412
##
## $grade_n
##
## 10 11 12 9
## 18191 18819 18523 18985
##
## $year
##
## 2007 2009 2011 2013 2015
## 14041 16410 15425 13583 15624
##
## $bmir
##
## normal obese overw underw
## 43303 7243 13044 6311
Missing values
There are quite a few NA’s for planning/committing suicide, maybe not everyone was asked these questions.
colSums(is.na(y))
## loneliness think suicide plan suicide commit suicide doctor suicide
## 787 786 1164 9066 10232
This is not an issue with the year; the question was asked in every wave.
dataf[, c('year', 'q29_nd')] %>% group_by(year) %>% summarise(commit_na = sum(is.na(q29_nd)))
| 2007 |
1557 |
| 2009 |
1801 |
| 2011 |
1911 |
| 2013 |
1601 |
| 2015 |
3057 |
Most of the participants who did not answer these questions said they had not thought about or planned a suicide. They may have just skipped the next two questions (about committing suicide and being treated by a doctor for a suicide attempt).
dataf[, c('q27_n', 'q28_n', 'q29_nd', 'q30_nd')] %>% rename(think= q27_n, plan = q28_n) %>% group_by(think, plan) %>%
summarise(commit_na = sum(is.na(q29_nd)), doctor_na = sum(is.na(q30_nd)))
## `summarise()` has grouped output by 'think'. You can override using the `.groups` argument.
| 0 |
0 |
7347 |
8318 |
| 0 |
1 |
322 |
371 |
| 0 |
NA |
76 |
120 |
| 1 |
0 |
666 |
746 |
| 1 |
1 |
861 |
971 |
| 1 |
NA |
34 |
47 |
| NA |
0 |
33 |
39 |
| NA |
1 |
16 |
20 |
| NA |
NA |
572 |
571 |
# Cross-tabulate plan vs commit suicide with missing values shown explicitly:
# recode NA as the string "NA" so table() gets it as a regular factor level.
# seq_len(ncol(.)) replaces 1:ncol(.), which misbehaves on zero columns.
ywithNA= y
for (i in seq_len(ncol(ywithNA))) { z= as.character(ywithNA[,i]); z[is.na(z)]= "NA"; ywithNA[,i]= factor(z) }
table(ywithNA[,'plan suicide'], ywithNA[,'commit suicide'])
##
## 0 1 NA
## 0 54758 1374 8046
## 1 1840 7563 338
## NA 387 95 682
We solve this by setting commit and doctor to 1 (did not attempt suicide and did not see a doctor about it) if both think and plan are 1 (did not think or plan suicide) above. This solves the NA puzzle.
colSums(is.na(data[,y_vars]))
## q26_n q27_n q28_n q29_ndr q30_ndr
## 787 786 1164 9066 10232
Correlations
Treatments. Note: Mean use is a linear combination of TV and ED use.
xnum= apply(xf,2,as.numeric)
round(cor(xnum,use='pairwise.complete.obs'),3)
## TV Use Electronic Device Use Mean Technology Use
## TV Use 1.000 0.207 0.754
## Electronic Device Use 0.207 1.000 0.798
## Mean Technology Use 0.754 0.798 1.000
Outcomes.
round(cor(y,use='pairwise.complete.obs'),3)
## loneliness think suicide plan suicide commit suicide
## loneliness 1.000 0.428 0.372 0.395
## think suicide 0.428 1.000 0.629 0.797
## plan suicide 0.372 0.629 1.000 0.797
## commit suicide 0.395 0.797 0.797 1.000
## doctor suicide 0.384 0.782 0.850 0.923
## doctor suicide
## loneliness 0.384
## think suicide 0.782
## plan suicide 0.850
## commit suicide 0.923
## doctor suicide 1.000
Controls. Correlation between age and grade is high but not perfect, it may be possible to disentangle the effect of each on wellbeing.
datanum= sapply(c(cvars,cvarsplus), function(z) as.numeric(dataf[,z]))
round(cor(datanum, use='pairwise.complete.obs'),3)
## race_di age_n sex_n grade_n year bmir
## race_di 1.000 0.001 0.008 0.004 0.013 -0.082
## age_n 0.001 1.000 0.036 0.863 -0.019 0.128
## sex_n 0.008 0.036 1.000 0.003 0.006 0.060
## grade_n 0.004 0.863 0.003 1.000 -0.013 0.110
## year 0.013 -0.019 0.006 -0.013 1.000 0.008
## bmir -0.082 0.128 0.060 0.110 0.008 1.000
Bivariate pmf to explore the dependence between TV and electronic device use.
# Pairwise outcome cross-tabulations plus a joint plot of the two treatments.
# seq_len() replaces 1:ncol() (unsafe on zero columns); vapply() replaces
# sapply() to guarantee a character vector for the names.
ypairs= combn(ncol(y),2)
# NOTE(review): `tab` is computed but not displayed in this section.
tab= lapply(seq_len(ncol(ypairs)), function(i) table(y[,ypairs[1,i]],y[,ypairs[2,i]]))
names(tab)= vapply(seq_len(ncol(ypairs)), function(i) paste(names(y)[ypairs[,i]], collapse=','), character(1))
plotxtab(xf[,1], xf[,2], xlab=names(xf)[1], ylab= names(xf)[2])
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: RColorBrewer

Focus on the two extreme and most frequent categories of electronic device use.
# Row-percentage distribution of TV use within ED-use categories 1 and 6.
# NOTE(review): the surrounding text says the "two extreme" ED categories,
# yet column 6 (4 h/day), not 7 (5+ h/day), is selected -- confirm c(1,6)
# is intended rather than c(1,7).
tab= t(table(xf[,1],xf[,2])[,c(1,6)]); tab= 100 * tab/rowSums(tab) #ISSUE: slight negative association between TV & electronic device use
rownames(tab)= paste(names(xf)[2], '=',c(1,6))
barplot(tab, xlab=names(xf)[1], beside=TRUE, ylab='% adolescents', legend=TRUE)

LINEARITY OF TREATMENT AND COVARIATE EFFECTS
Outcome 1. Loneliness
# Assemble the regression frame for outcome 1 (loneliness): outcome in
# column 'y', factor-coded treatments, then controls; keep complete cases.
idy= 1; idx= c(1,2)
datareg= data.frame(y[,idy], xf[,idx], dataf[,c(cvars,cvarsplus)])
names(datareg)[1]= 'y'
names(datareg)[2:(1+length(idx))]= names(x)[idx]
sel= rowSums(is.na(datareg))==0
datareg= datareg[sel,]
Fit logistic regression model via MLE. Higher TV use associated to less loneliness, higher ED use to more loneliness. The effect of age is mainly from 12 to 13 years old. There’s higher loneliness at higher grades. Sex has a clear effect. Higher BMI associated to less loneliness (not intuitive why this might be the case).
# Fit the exploratory logit by MLE; getci() (project helper, functions.R)
# extracts point estimates with interval bounds for the linearity plots.
fit1= glm(y ~ ., data=datareg, family=binomial(link='logit'))
bmle= getci(fit1)
summary(fit1)
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = datareg)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.469 -0.887 -0.669 1.292 2.042
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.117251 0.377848 0.310 0.756322
## `TV Use`2 -0.063439 0.032119 -1.975 0.048250 *
## `TV Use`3 -0.233508 0.034062 -6.855 7.11e-12 ***
## `TV Use`4 -0.192136 0.031230 -6.152 7.63e-10 ***
## `TV Use`5 -0.237746 0.033324 -7.134 9.73e-13 ***
## `TV Use`6 -0.199444 0.039965 -4.990 6.02e-07 ***
## `TV Use`7 -0.129120 0.036560 -3.532 0.000413 ***
## `Electronic Device Use`2 -0.094080 0.030132 -3.122 0.001795 **
## `Electronic Device Use`3 -0.009967 0.031958 -0.312 0.755129
## `Electronic Device Use`4 0.076511 0.030899 2.476 0.013280 *
## `Electronic Device Use`5 0.196857 0.033036 5.959 2.54e-09 ***
## `Electronic Device Use`6 0.351734 0.038589 9.115 < 2e-16 ***
## `Electronic Device Use`7 0.576534 0.031511 18.296 < 2e-16 ***
## race_di -0.153805 0.018188 -8.456 < 2e-16 ***
## age_n13 -0.785938 0.490908 -1.601 0.109379
## age_n14 -0.620994 0.376640 -1.649 0.099195 .
## age_n15 -0.502233 0.375603 -1.337 0.181177
## age_n16 -0.289247 0.375302 -0.771 0.440881
## age_n17 -0.203815 0.375452 -0.543 0.587232
## age_n18 -0.178269 0.376242 -0.474 0.635632
## sex_n1 -0.854301 0.018000 -47.461 < 2e-16 ***
## grade_n10 -0.172331 0.032259 -5.342 9.19e-08 ***
## grade_n11 -0.276281 0.041843 -6.603 4.04e-11 ***
## grade_n12 -0.386364 0.050071 -7.716 1.20e-14 ***
## year2009 -0.120919 0.027453 -4.405 1.06e-05 ***
## year2011 -0.060457 0.027876 -2.169 0.030098 *
## year2013 -0.078425 0.028905 -2.713 0.006664 **
## year2015 -0.058059 0.028289 -2.052 0.040137 *
## bmirnormal 0.066453 0.031454 2.113 0.034626 *
## bmiroverw 0.155732 0.035835 4.346 1.39e-05 ***
## bmirobese 0.219424 0.040036 5.481 4.24e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 80469 on 66303 degrees of freedom
## Residual deviance: 77214 on 66273 degrees of freedom
## AIC: 77276
##
## Number of Fisher Scoring iterations: 4
Graphically. It looks like treatment effects can roughly be captured as linear for ED, but not for TV use (which has a U-shaped effect). BMI, age, grade etc also appear fairly linear.
# Plot MLE estimates with interval bounds per dummy level, to eyeball
# whether each regressor's effect is roughly linear in the level index.
# seq_along(sel) replaces 1:length(sel), which would produce c(1, 0) and
# index out of range if grep() found no matching coefficients.
variables = c(names(x)[idx], cvarsplus)[-4]
plots = list()  # NOTE(review): appears unused here -- confirm before removing
for (v in variables) {
  sel= grep(v,rownames(bmle))
  plot(bmle[sel,1],ylim=range(bmle[sel,]), xaxt='n')
  segments(x0=seq_along(sel),y0=bmle[sel,2],y1=bmle[sel,3])
  axis(side=1, at=seq_along(sel), labels=rownames(bmle)[sel])
  title(v)
}






Outcome 2. Think suicide
# Outcome 2 (think suicide): same specification as outcome 1.
# NOTE(review): unlike outcome 1, no explicit complete-case filter here;
# glm() drops incomplete rows itself (see "observations deleted" below).
idy= 2; idx= c(1,2)
datareg= data.frame(y[,idy], xf[,idx], dataf[,c(cvars,cvarsplus)])
names(datareg)[1]= 'y'
names(datareg)[2:(1+length(idx))]= names(x)[idx]
fit1= glm(y ~ ., data=datareg, family=binomial(link='logit'))
bmle= getci(fit1)
summary(fit1)
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = datareg)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5945 -0.6355 -0.5172 -0.4210 2.3676
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.194461 0.374606 0.519 0.603686
## `TV Use`2 -0.156159 0.038815 -4.023 5.74e-05 ***
## `TV Use`3 -0.320939 0.041816 -7.675 1.65e-14 ***
## `TV Use`4 -0.280481 0.037991 -7.383 1.55e-13 ***
## `TV Use`5 -0.284635 0.040553 -7.019 2.24e-12 ***
## `TV Use`6 -0.235128 0.048773 -4.821 1.43e-06 ***
## `TV Use`7 -0.116827 0.043610 -2.679 0.007386 **
## `Electronic Device Use`2 -0.048905 0.038118 -1.283 0.199494
## `Electronic Device Use`3 -0.005226 0.040506 -0.129 0.897343
## `Electronic Device Use`4 0.117252 0.038685 3.031 0.002438 **
## `Electronic Device Use`5 0.204959 0.041139 4.982 6.29e-07 ***
## `Electronic Device Use`6 0.290916 0.047807 6.085 1.16e-09 ***
## `Electronic Device Use`7 0.591975 0.037882 15.627 < 2e-16 ***
## race_di 0.051404 0.022433 2.291 0.021937 *
## age_n13 -1.651279 0.536729 -3.077 0.002094 **
## age_n14 -1.524644 0.372691 -4.091 4.30e-05 ***
## age_n15 -1.459371 0.371112 -3.932 8.41e-05 ***
## age_n16 -1.346693 0.370510 -3.635 0.000278 ***
## age_n17 -1.273160 0.370612 -3.435 0.000592 ***
## age_n18 -1.345002 0.371956 -3.616 0.000299 ***
## sex_n1 -0.781669 0.022620 -34.557 < 2e-16 ***
## grade_n10 -0.111711 0.038972 -2.866 0.004151 **
## grade_n11 -0.250419 0.051071 -4.903 9.42e-07 ***
## grade_n12 -0.392418 0.061701 -6.360 2.02e-10 ***
## year2009 -0.070638 0.034778 -2.031 0.042244 *
## year2011 0.013969 0.034910 0.400 0.689048
## year2013 0.010916 0.035972 0.303 0.761536
## year2015 0.093883 0.034843 2.695 0.007049 **
## bmirnormal -0.015476 0.038316 -0.404 0.686277
## bmiroverw 0.141643 0.043590 3.249 0.001156 **
## bmirobese 0.343084 0.047738 7.187 6.63e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 58089 on 66302 degrees of freedom
## Residual deviance: 56061 on 66272 degrees of freedom
## (8780 observations deleted due to missingness)
## AIC: 56123
##
## Number of Fisher Scoring iterations: 4
# Coefficient/CI plots for outcome 2; the TV-use panel is recorded into
# the supplement. seq_along(sel) replaces the unsafe 1:length(sel) idiom.
variables = c(names(x)[idx], cvarsplus)[-4]
plots = list()
for (v in variables) {
  sel= grep(v,rownames(bmle))
  plot(bmle[sel,1],ylim=range(bmle[sel,]), xaxt='n', ylab='Estimated coefficient')
  segments(x0=seq_along(sel),y0=bmle[sel,2],y1=bmle[sel,3])
  axis(side=1, at=seq_along(sel), labels=rownames(bmle)[sel])
  title(v)
  if (v == 'TV Use') {
    supplement$lin_tv_think = recordPlot()
  }
}






Outcome 3. Plan suicide
# Outcome 3 (plan suicide): same specification as outcome 2.
idy= 3; idx= c(1,2)
datareg= data.frame(y[,idy], xf[,idx], dataf[,c(cvars,cvarsplus)])
names(datareg)[1]= 'y'
names(datareg)[2:(1+length(idx))]= names(x)[idx]
fit1= glm(y ~ ., data=datareg, family=binomial(link='logit'))
bmle= getci(fit1)
summary(fit1)
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = datareg)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3627 -0.5615 -0.4831 -0.3967 2.4206
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.233590 0.400532 -0.583 0.559759
## `TV Use`2 -0.219193 0.041850 -5.238 1.63e-07 ***
## `TV Use`3 -0.325690 0.044815 -7.267 3.66e-13 ***
## `TV Use`4 -0.348204 0.041004 -8.492 < 2e-16 ***
## `TV Use`5 -0.340423 0.043733 -7.784 7.02e-15 ***
## `TV Use`6 -0.320691 0.053069 -6.043 1.51e-09 ***
## `TV Use`7 -0.183544 0.046842 -3.918 8.92e-05 ***
## `Electronic Device Use`2 -0.086766 0.041775 -2.077 0.037801 *
## `Electronic Device Use`3 -0.060600 0.044518 -1.361 0.173435
## `Electronic Device Use`4 0.073060 0.042266 1.729 0.083885 .
## `Electronic Device Use`5 0.161474 0.044835 3.602 0.000316 ***
## `Electronic Device Use`6 0.271814 0.051725 5.255 1.48e-07 ***
## `Electronic Device Use`7 0.598322 0.040657 14.716 < 2e-16 ***
## race_di 0.010429 0.024433 0.427 0.669493
## age_n13 -1.573645 0.589823 -2.668 0.007630 **
## age_n14 -1.340033 0.398545 -3.362 0.000773 ***
## age_n15 -1.282802 0.396873 -3.232 0.001228 **
## age_n16 -1.163473 0.396359 -2.935 0.003331 **
## age_n17 -1.130246 0.396551 -2.850 0.004369 **
## age_n18 -1.178865 0.397991 -2.962 0.003056 **
## sex_n1 -0.624138 0.024398 -25.581 < 2e-16 ***
## grade_n10 -0.101400 0.042175 -2.404 0.016205 *
## grade_n11 -0.274118 0.055451 -4.943 7.68e-07 ***
## grade_n12 -0.336748 0.067046 -5.023 5.10e-07 ***
## year2009 -0.061259 0.038475 -1.592 0.111342
## year2011 0.062332 0.038267 1.629 0.103340
## year2013 0.069808 0.039229 1.779 0.075159 .
## year2015 0.161303 0.038076 4.236 2.27e-05 ***
## bmirnormal -0.005202 0.041765 -0.125 0.900878
## bmiroverw 0.103078 0.047598 2.166 0.030342 *
## bmirobese 0.305885 0.051917 5.892 3.82e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 50767 on 65973 degrees of freedom
## Residual deviance: 49326 on 65943 degrees of freedom
## (9109 observations deleted due to missingness)
## AIC: 49388
##
## Number of Fisher Scoring iterations: 5
# Coefficient/CI plots for outcome 3.
# seq_along(sel) replaces the unsafe 1:length(sel) idiom.
variables = c(names(x)[idx], cvarsplus)[-4]
plots = list()
for (v in variables) {
  sel= grep(v,rownames(bmle))
  plot(bmle[sel,1],ylim=range(bmle[sel,]), xaxt='n')
  segments(x0=seq_along(sel),y0=bmle[sel,2],y1=bmle[sel,3])
  axis(side=1, at=seq_along(sel), labels=rownames(bmle)[sel])
  title(v)
}






Outcome 4. Commit suicide
# Outcome 4 (commit suicide): same specification as outcomes 2-3.
idy= 4; idx= c(1,2)
datareg= data.frame(y[,idy], xf[,idx], dataf[,c(cvars,cvarsplus)])
names(datareg)[1]= 'y'
names(datareg)[2:(1+length(idx))]= names(x)[idx]
fit1= glm(y ~ ., data=datareg, family=binomial(link='logit'))
bmle= getci(fit1)
summary(fit1)
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = datareg)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0068 -0.5715 -0.4772 -0.3800 2.5000
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.135614 0.431020 2.635 0.008421 **
## `TV Use`2 -0.194494 0.044155 -4.405 1.06e-05 ***
## `TV Use`3 -0.313922 0.047452 -6.616 3.70e-11 ***
## `TV Use`4 -0.316636 0.043275 -7.317 2.54e-13 ***
## `TV Use`5 -0.302101 0.046185 -6.541 6.11e-11 ***
## `TV Use`6 -0.211911 0.055224 -3.837 0.000124 ***
## `TV Use`7 -0.015401 0.048920 -0.315 0.752896
## `Electronic Device Use`2 -0.157917 0.043531 -3.628 0.000286 ***
## `Electronic Device Use`3 -0.110419 0.046131 -2.394 0.016683 *
## `Electronic Device Use`4 -0.049058 0.044275 -1.108 0.267846
## `Electronic Device Use`5 0.070398 0.046832 1.503 0.132784
## `Electronic Device Use`6 0.164596 0.054154 3.039 0.002370 **
## `Electronic Device Use`7 0.480891 0.042479 11.321 < 2e-16 ***
## race_di -0.114740 0.025736 -4.458 8.26e-06 ***
## age_n13 -3.133853 0.677255 -4.627 3.70e-06 ***
## age_n14 -2.558785 0.428660 -5.969 2.38e-09 ***
## age_n15 -2.493689 0.426911 -5.841 5.18e-09 ***
## age_n16 -2.281050 0.426236 -5.352 8.72e-08 ***
## age_n17 -2.206070 0.426246 -5.176 2.27e-07 ***
## age_n18 -2.223216 0.427675 -5.198 2.01e-07 ***
## sex_n1 -0.723560 0.025933 -27.901 < 2e-16 ***
## grade_n10 -0.183183 0.044395 -4.126 3.69e-05 ***
## grade_n11 -0.428616 0.058452 -7.333 2.25e-13 ***
## grade_n12 -0.583331 0.071001 -8.216 < 2e-16 ***
## year2009 -0.103767 0.040032 -2.592 0.009539 **
## year2011 0.063665 0.039725 1.603 0.109018
## year2013 0.034488 0.040988 0.841 0.400115
## year2015 0.208038 0.039767 5.231 1.68e-07 ***
## bmirnormal 0.007587 0.043943 0.173 0.862925
## bmiroverw 0.153673 0.049928 3.078 0.002085 **
## bmirobese 0.372927 0.054466 6.847 7.54e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 46296 on 59411 degrees of freedom
## Residual deviance: 44584 on 59381 degrees of freedom
## (15671 observations deleted due to missingness)
## AIC: 44646
##
## Number of Fisher Scoring iterations: 5
Including age 13 leads to a very large (imprecisely estimated) coefficient in this regression.
# Coefficient/CI plots for outcome 4. For age_n the first dummy (age 13)
# is dropped from the plot: its estimate is very large and imprecise
# (see note above the chunk). seq_along(sel) replaces unsafe 1:length(sel).
variables = c(names(x)[idx], cvarsplus)[-4]
plots = list()
for (v in variables) {
  sel= grep(v,rownames(bmle))
  if (v == 'age_n') sel=sel[-1]
  plot(bmle[sel,1],ylim=range(bmle[sel,]), xaxt='n')
  segments(x0=seq_along(sel),y0=bmle[sel,2],y1=bmle[sel,3])
  axis(side=1, at=seq_along(sel), labels=rownames(bmle)[sel])
  title(v)
}






Outcome 5. Doctor suicide
# Outcome 5 (doctor suicide): same specification as outcomes 2-4.
idy= 5; idx= c(1,2)
datareg= data.frame(y[,idy], xf[,idx], dataf[,c(cvars,cvarsplus)])
names(datareg)[1]= 'y'
names(datareg)[2:(1+length(idx))]= names(x)[idx]
fit1= glm(y ~ ., data=datareg, family=binomial(link='logit'))
bmle= getci(fit1)
summary(fit1)
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = datareg)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6275 -0.5342 -0.4495 -0.3580 2.5300
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.18904 0.43867 0.431 0.666510
## `TV Use`2 -0.21398 0.04669 -4.583 4.58e-06 ***
## `TV Use`3 -0.34028 0.05036 -6.757 1.41e-11 ***
## `TV Use`4 -0.32136 0.04574 -7.025 2.13e-12 ***
## `TV Use`5 -0.33165 0.04911 -6.753 1.44e-11 ***
## `TV Use`6 -0.23572 0.05894 -3.999 6.35e-05 ***
## `TV Use`7 -0.02023 0.05184 -0.390 0.696343
## `Electronic Device Use`2 -0.12245 0.04649 -2.634 0.008448 **
## `Electronic Device Use`3 -0.10660 0.04956 -2.151 0.031493 *
## `Electronic Device Use`4 -0.03699 0.04744 -0.780 0.435593
## `Electronic Device Use`5 0.10394 0.04985 2.085 0.037070 *
## `Electronic Device Use`6 0.16698 0.05789 2.885 0.003919 **
## `Electronic Device Use`7 0.51062 0.04514 11.313 < 2e-16 ***
## race_di -0.01424 0.02726 -0.522 0.601511
## age_n13 -2.25261 0.68276 -3.299 0.000969 ***
## age_n14 -1.87280 0.43612 -4.294 1.75e-05 ***
## age_n15 -1.82909 0.43430 -4.212 2.54e-05 ***
## age_n16 -1.65817 0.43385 -3.822 0.000132 ***
## age_n17 -1.59442 0.43417 -3.672 0.000240 ***
## age_n18 -1.61010 0.43592 -3.694 0.000221 ***
## sex_n1 -0.71767 0.02766 -25.945 < 2e-16 ***
## grade_n10 -0.13464 0.04722 -2.851 0.004357 **
## grade_n11 -0.37488 0.06241 -6.007 1.89e-09 ***
## grade_n12 -0.49513 0.07580 -6.532 6.50e-11 ***
## year2009 -0.06231 0.04319 -1.443 0.149114
## year2011 0.13377 0.04290 3.119 0.001817 **
## year2013 0.11466 0.04389 2.612 0.008996 **
## year2015 0.30047 0.04248 7.073 1.52e-12 ***
## bmirnormal 0.01361 0.04673 0.291 0.770806
## bmiroverw 0.15285 0.05314 2.876 0.004023 **
## bmirobese 0.39797 0.05770 6.897 5.30e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41872 on 58470 degrees of freedom
## Residual deviance: 40364 on 58440 degrees of freedom
## (16612 observations deleted due to missingness)
## AIC: 40426
##
## Number of Fisher Scoring iterations: 5
# Coefficient/CI plots for outcome 5; as for outcome 4, the first age
# dummy is dropped. seq_along(sel) replaces unsafe 1:length(sel).
variables = c(names(x)[idx], cvarsplus)[-4]
plots = list()
for (v in variables) {
  sel= grep(v,rownames(bmle))
  if (v == 'age_n') sel=sel[-1]
  plot(bmle[sel,1],ylim=range(bmle[sel,]), xaxt='n')
  segments(x0=seq_along(sel),y0=bmle[sel,2],y1=bmle[sel,3])
  axis(side=1, at=seq_along(sel), labels=rownames(bmle)[sel])
  title(v)
}






REGRESSION ANALYSIS
Issues with Orben’s et al analysis
Orben et al run linear regression, but all 5 outcomes are discrete
Sex and grade highly significant, but ignored in OP’s analysis
Graphical parameters
Use the re-defined treatment variables (see data pre-treatment).
# Switch to the re-coded treatments (discretized TV, normalized ED use).
x= data[,x_vars]; names(x)= x_names
Include one year coefficient in the control variable panel and set readable names for treatments and controls. Include year dummies but only show them as one variable in the BSCA. Include dummies for ages 12 and 13, to control for effects of these students being misaligned with their grade.
# Controls and display labels for the BSCA figures.
cvars= "race_di"
cvarsplus= c("age12", "age13", "age_n", "sex_n", "grade_n", "year", "bmi")
c_names = c('Race', 'Aged 12', 'Aged 13', 'Age', 'Male', 'Grade', 'Year', 'BMI')
data[,'year']= factor(data[,'year']) # keep year as a factor
# Positions of the year dummies in the coefficient vector, hidden so the
# BSCA shows year as a single variable.
id_years=c(12:14) # hide all but one year coefficient (same variable -> included together)
x_labels = c('TV use: medium', 'TV use: high', 'Electronic device use')
var_labels = c_names
Transform the y labels into the odds ratio (from min=0 to max=1), by exponentiating the coefficient.
# Axis labels on the odds-ratio scale: show OR values at positions equal
# to their log (the coefficient scale on which the plot is drawn).
y_labels = seq(0.8, 2, 0.2); names(y_labels) = log(y_labels)
Only show 50 models and decrease the size of the legend to nicely combine in the grid figure.
# Display options for single_bsca: cap the number of models shown and
# shrink the legend so panels combine cleanly into the grid figure.
maxmodels = 50
legend_size = 1
Outcome 1. Loneliness
Regress the first outcome (loneliness) on the two treatments. Store the data in datareg.
# Regression frame for the BSCA of outcome 1 (loneliness), using the
# re-coded treatments and display names; keep complete cases.
idy= 1; idx= c(1,2)
datareg= data.frame(y[,idy], x[,idx], data[,c(cvars,cvarsplus)])
names(datareg) = c('y', names(x)[idx], c_names)
sel= rowSums(is.na(datareg))==0
datareg= datareg[sel,]
Fit logistic regression model via MLE. Higher TV use associated to less loneliness, higher ED use to more loneliness. The effect of age is mainly from 12 to 13 years old. There’s higher loneliness at higher grades. Sex has a clear effect. Higher BMI associated to less loneliness (not intuitive why this might be the case).
# MLE logit fit with the re-coded treatments, for comparison with the
# Bayesian model averaging results below.
reg= y ~ .
fit1= glm(reg, data=datareg, family=binomial(link='logit'))
bmle= getci(fit1)
summary(fit1)
##
## Call:
## glm(formula = reg, family = binomial(link = "logit"), data = datareg)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6353 -0.8806 -0.6784 1.2890 2.0571
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.409322 0.128734 -10.948 < 2e-16 ***
## `TV Use`medium -0.198743 0.019707 -10.085 < 2e-16 ***
## `TV Use`high -0.078471 0.031469 -2.494 0.0126 *
## `Electronic Device Use` 0.599879 0.027129 22.112 < 2e-16 ***
## Race -0.168150 0.017986 -9.349 < 2e-16 ***
## `Aged 12` 0.872162 0.379710 2.297 0.0216 *
## `Aged 13` -0.042573 0.317175 -0.134 0.8932
## Age 0.116986 0.014841 7.882 3.21e-15 ***
## Male -0.866052 0.017922 -48.323 < 2e-16 ***
## Grade -0.120497 0.016110 -7.480 7.45e-14 ***
## Year2009 -0.123163 0.027432 -4.490 7.13e-06 ***
## Year2011 -0.064442 0.027838 -2.315 0.0206 *
## Year2013 -0.064158 0.028811 -2.227 0.0260 *
## Year2015 -0.033438 0.028102 -1.190 0.2341
## BMI 0.012493 0.001723 7.252 4.10e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 80469 on 66303 degrees of freedom
## Residual deviance: 77346 on 66289 degrees of freedom
## AIC: 77376
##
## Number of Fisher Scoring iterations: 4
Run EBIC-based model selection. This is an approximation to Bayesian model selection under the unit information prior, and BetaBin(1,1) on the model space. Unfortunately for logistic regression this uses a slowish R implementation. We set includevars=1 to force the inclusion of the intercept.
# Enumerate all candidate models (10 free predictors -> 1024 models) under
# the stated priors; includevars=1 forces the intercept into every model.
ms= mombf:::modelSelectionGLM(reg, data=datareg, includevars=1, familyglm= binomial(link='logit'), priorDelta=modelbbprior(1,1))
## Enumerating 1024 models..........
Most posterior probability is assigned to the 2 top models, i.e. there’s little uncertainty about which variables should be included.
pp= postProb(ms)
head(pp)
| 926 |
1,2,3,4,5,8,9,10,15 |
binomial logit |
0.8742234 |
| 990 |
1,2,3,4,5,6,8,9,10,15 |
binomial logit |
0.1131518 |
| 958 |
1,2,3,4,5,7,8,9,10,15 |
binomial logit |
0.0103517 |
| 1022 |
1,2,3,4,5,6,7,8,9,10,15 |
binomial logit |
0.0022227 |
| 928 |
1,2,3,4,5,8,9,10,11,12,13,14,15 |
binomial logit |
0.0000404 |
| 992 |
1,2,3,4,5,6,8,9,10,11,12,13,14,15 |
binomial logit |
0.0000089 |
We obtain BMA estimates, 95% posterior intervals and marginal posterior inclusion probabilities. Both TV and ED use have very high posterior prob. Similarly strong evidence for including sex, grade and bmi. Small post prob to race, the adjustment variable considered by Orben et al. Age and year are dropped.
b= coef(ms)
b
## estimate 2.5% 97.5% margpp
## (Intercept) -1.441659e+00 -1.690104818 -1.19286256 1.000000e+00
## `TV Use`medium -2.005901e-01 -0.239062280 -0.16249278 1.000000e+00
## `TV Use`high -7.880829e-02 -0.140010147 -0.01713051 1.000000e+00
## `Electronic Device Use` 6.034143e-01 0.551856512 0.65637300 1.000000e+00
## Race -1.675038e-01 -0.202968973 -0.13263115 1.000000e+00
## `Aged 12` 9.763232e-02 0.000000000 1.17053596 1.153838e-01
## `Aged 13` -1.092674e-03 0.000000000 0.00000000 1.257559e-02
## Age 1.118438e-01 0.083224218 0.14050808 1.000000e+00
## Male -8.653192e-01 -0.900475527 -0.83084152 1.000000e+00
## Grade -1.157834e-01 -0.146725928 -0.08458646 1.000000e+00
## Year2009 -1.564692e-05 0.000000000 0.00000000 5.040349e-05
## Year2011 -7.833472e-06 0.000000000 0.00000000 5.040349e-05
## Year2013 -9.228734e-06 0.000000000 0.00000000 5.040349e-05
## Year2015 -8.685394e-06 0.000000000 0.00000000 5.040349e-05
## BMI 1.277104e-02 0.009421761 0.01616065 1.000000e+00
Odds-ratios help characterize the practical importance of the estimated treatment effects (use getOR defined in functions.R for convenience).
getOR(b, treat='TV Use') %>% format(digits=2, scientific=FALSE) # medium and high treatments
## OR CI.low CI.up
## `TV Use`medium "0.82" "0.79" "0.85"
## `TV Use`high "0.92" "0.87" "0.98"
getOR(b, treat='Electronic', treatvals=round((1:6)/6, 2)) %>% format(digits=3, scientific=FALSE) #increasing treatment by 1,...,6 units
## OR CI.low CI.up
## Electronic 0.17 "1.11" "1.10" "1.12"
## Electronic 0.33 "1.22" "1.20" "1.24"
## Electronic 0.5 "1.35" "1.32" "1.39"
## Electronic 0.67 "1.50" "1.45" "1.55"
## Electronic 0.83 "1.65" "1.58" "1.72"
## Electronic 1 "1.83" "1.74" "1.93"
We could obtain parameter estimates and posterior intervals for the top 100 models.
# Per-model estimates and 95% intervals for the ED-use coefficient across
# the top 100 models.
bmodels= coefByModel(ms, maxmodels=100, alpha=0.05)
sel= "`Electronic Device Use`"
bsel= cbind(postmean=bmodels$postmean[,sel],ci.low=bmodels$ci.low[,sel],ci.up=bmodels$ci.up[,sel])
head(round(bsel,3))
Produce the BMA SCA plots. Clearly TV and ED use have effects of opposing signs.
# Coefficients 2-4 are the treatment effects highlighted in the BSCA;
# they (and the hidden year dummies) are omitted from the control panel.
idx_fit=c(2:4)
single_bsca(ms, coefidx=idx_fit, omitvars=c(1, idx_fit, id_years), x.labels=x_labels, var.labels=var_labels, y.labels=y_labels, maxmodels=maxmodels, legend.cex=legend_size)

# Save the rendered plot for the combined grid figure (one slot per outcome).
bscas = list()
bscas[[idy]] = recordPlot()
Outcome 2. Think suicide
# Outcome 2 (think suicide): rebuild the regression frame with the
# re-coded treatments and rerun the EBIC-based model enumeration.
idy= 2; idx= c(1,2); idx_fit=c(2:4)
reg= y ~ .
datareg= data.frame(y[,idy], x[,idx], data[,c(cvars,cvarsplus)])
names(datareg)[1]= 'y'
names(datareg)[2:(1+length(idx))]= names(x)[idx]
sel= rowSums(is.na(datareg))==0
datareg= datareg[sel,]
ms= mombf:::modelSelectionGLM(reg, data=datareg, includevars=1, familyglm= binomial(link='logit'), priorDelta=modelbbprior(1,1))
## Enumerating 1024 models..........
b= coef(ms)
getOR(b, treat='TV Use') %>% format(digits=2, scientific=FALSE) # medium and high treatments
## OR CI.low CI.up
## `TV Use`medium "0.79" "0.76" "0.83"
## `TV Use`high "0.95" "0.89" "1.03"
getOR(b, treat='Electronic', treatvals=round((1:6)/6, 2)) %>% format(digits=3, scientific=FALSE) #increasing treatment by 1,...,6 units
## OR CI.low CI.up
## Electronic 0.17 "1.11" "1.10" "1.13"
## Electronic 0.33 "1.23" "1.20" "1.26"
## Electronic 0.5 "1.37" "1.33" "1.41"
## Electronic 0.67 "1.52" "1.46" "1.59"
## Electronic 0.83 "1.69" "1.60" "1.78"
## Electronic 1 "1.88" "1.76" "2.00"
single_bsca(ms, coefidx=idx_fit, omitvars=c(1, idx_fit, id_years), x.labels=x_labels, var.labels=var_labels, y.labels=y_labels, maxmodels=maxmodels, legend.cex=legend_size)

bscas[[idy]] = recordPlot()
Outcome 3. Plan suicide
# Outcome 3 (plan suicide): BMA logistic regression with both treatments
# and the full control set.
idy <- 3; idx <- c(1, 2); idx_fit <- c(2:4)
reg <- y ~ .
datareg <- data.frame(y[, idy], x[, idx], data[, c(cvars, cvarsplus)])
names(datareg)[1] <- 'y'
names(datareg)[2:(1 + length(idx))] <- names(x)[idx]
# keep complete cases only
sel <- complete.cases(datareg)
datareg <- datareg[sel, ]
ms <- mombf:::modelSelectionGLM(
  reg, data=datareg, includevars=1,
  familyglm= binomial(link='logit'), priorDelta=modelbbprior(1,1)
)
## Enumerating 1024 models..........
b <- coef(ms)
# odds ratios for the medium and high TV-use categories
format(getOR(b, treat='TV Use'), digits=2, scientific=FALSE)
## OR CI.low CI.up
## `TV Use`medium "0.79" "0.75" "0.83"
## `TV Use`high "0.96" "0.89" "1.04"
# odds ratios when increasing ED use by 1,...,6 units
format(getOR(b, treat='Electronic', treatvals=round((1:6)/6, 2)), digits=3, scientific=FALSE)
## OR CI.low CI.up
## Electronic 0.17 "1.11" "1.10" "1.12"
## Electronic 0.33 "1.22" "1.20" "1.25"
## Electronic 0.5 "1.36" "1.31" "1.41"
## Electronic 0.67 "1.51" "1.44" "1.58"
## Electronic 0.83 "1.67" "1.57" "1.77"
## Electronic 1 "1.85" "1.72" "1.98"
# BMA specification-curve plot for this outcome
single_bsca(
  ms, coefidx=idx_fit, omitvars=c(1, idx_fit, id_years),
  x.labels=x_labels, var.labels=var_labels, y.labels=y_labels,
  maxmodels=maxmodels, legend.cex=legend_size
)

bscas[[idy]] <- recordPlot()
Outcome 4. Commit suicide
# Outcome 4 (commit suicide): BMA logistic regression with both treatments
# and the full control set.
idy <- 4; idx <- c(1, 2); idx_fit <- c(2:4)
reg <- y ~ .
datareg <- data.frame(y[, idy], x[, idx], data[, c(cvars, cvarsplus)])
names(datareg)[1] <- 'y'
names(datareg)[2:(1 + length(idx))] <- names(x)[idx]
# keep complete cases only
sel <- complete.cases(datareg)
datareg <- datareg[sel, ]
ms <- mombf:::modelSelectionGLM(
  reg, data=datareg, includevars=1,
  familyglm= binomial(link='logit'), priorDelta=modelbbprior(1,1)
)
## Enumerating 1024 models..........
b <- coef(ms)
# odds ratios for the medium and high TV-use categories
format(getOR(b, treat='TV Use'), digits=2, scientific=FALSE)
## OR CI.low CI.up
## `TV Use`medium "0.81" "0.76" "0.85"
## `TV Use`high "1.12" "1.03" "1.22"
# odds ratios when increasing ED use by 1,...,6 units
format(getOR(b, treat='Electronic', treatvals=round((1:6)/6, 2)), digits=3, scientific=FALSE)
## OR CI.low CI.up
## Electronic 0.17 "1.09" "1.08" "1.11"
## Electronic 0.33 "1.19" "1.16" "1.22"
## Electronic 0.5 "1.30" "1.25" "1.34"
## Electronic 0.67 "1.41" "1.35" "1.49"
## Electronic 0.83 "1.54" "1.45" "1.63"
## Electronic 1 "1.68" "1.56" "1.81"
# BMA specification-curve plot for this outcome
single_bsca(
  ms, coefidx=idx_fit, omitvars=c(1, idx_fit, id_years),
  x.labels=x_labels, var.labels=var_labels, y.labels=y_labels,
  maxmodels=maxmodels, legend.cex=legend_size
)

bscas[[idy]] <- recordPlot()
Outcome 5. Doctor suicide
# Outcome 5 (doctor suicide): BMA logistic regression with both treatments
# and the full control set.
idy <- 5; idx <- c(1, 2); idx_fit <- c(2:4)
reg <- y ~ .
datareg <- data.frame(y[, idy], x[, idx], data[, c(cvars, cvarsplus)])
names(datareg)[1] <- 'y'
names(datareg)[2:(1 + length(idx))] <- names(x)[idx]
# keep complete cases only
sel <- complete.cases(datareg)
datareg <- datareg[sel, ]
ms <- mombf:::modelSelectionGLM(
  reg, data=datareg, includevars=1,
  familyglm= binomial(link='logit'), priorDelta=modelbbprior(1,1)
)
## Enumerating 1024 models..........
b <- coef(ms)
# odds ratios for the medium and high TV-use categories
format(getOR(b, treat='TV Use'), digits=2, scientific=FALSE)
## OR CI.low CI.up
## `TV Use`medium "0.80" "0.76" "0.85"
## `TV Use`high "1.14" "1.04" "1.24"
# odds ratios when increasing ED use by 1,...,6 units
format(getOR(b, treat='Electronic', treatvals=round((1:6)/6, 2)), digits=3, scientific=FALSE)
## OR CI.low CI.up
## Electronic 0.17 "1.10" "1.08" "1.11"
## Electronic 0.33 "1.19" "1.16" "1.23"
## Electronic 0.5 "1.31" "1.26" "1.36"
## Electronic 0.67 "1.43" "1.36" "1.51"
## Electronic 0.83 "1.56" "1.46" "1.67"
## Electronic 1 "1.71" "1.58" "1.86"
# BMA specification-curve plot for this outcome
single_bsca(
  ms, coefidx=idx_fit, omitvars=c(1, idx_fit, id_years),
  x.labels=x_labels, var.labels=var_labels, y.labels=y_labels,
  maxmodels=maxmodels, legend.cex=legend_size
)

bscas[[idy]] <- recordPlot()
SUBGROUP ANALYSIS
We perform a gender subgroup analysis. The variables are parameterized as described in the paper. Interaction terms are included only if the corresponding main variable is included. BMA is done through Gibbs sampling.
# Gender subgroup analysis: for each outcome, fit a BMA logistic model with
# both treatments, all controls, and treatment-by-gender interactions.
# Interactions are hierarchically constrained to enter only when the
# corresponding main effect is in the model; BMA is run via Gibbs sampling.
idx <- c(1, 2); idg <- "Male"
{
  # na.pass so model.matrix keeps rows with NAs; complete rows are selected
  # later via na.omit() once all columns have been assembled.
  # NOTE(review): this changes a global option and is never restored.
  options(na.action='na.pass')
  x.reg <- model.matrix(~ ., x[, idx])[, -1]   # drop intercept column
  colnames(x.reg) <- x_labels
}
ms_inter <- list(); coef_inter <- list()
# y_vars: one entry per outcome, defined earlier in the document
for (idy in seq_along(y_vars)) {
  yname <- y_names[idy]
  message(yname)
  datareg <- na.omit(data.frame(y[, idy], x.reg, data[, c(cvars, cvarsplus)]))
  names(datareg) <- c('y', colnames(x.reg), c_names)
  # center treatments at 1/2 so interaction terms are interpretable
  idt <- colnames(x.reg)
  datareg[idt] <- datareg[idt] - 1/2
  # center the group indicator by the group proportion rho
  g <- datareg[[idg]]; rho <- sum(g == 1)/length(g)
  datareg[[idg]] <- ifelse(g == 1, 1-rho, -rho)
  # treatment x gender interaction columns
  inter <- datareg[idt]*datareg[[idg]]
  colnames(inter) <- paste0(colnames(inter), " × male")
  d <- mombf:::createDesign(y ~ ., cbind(datareg, inter))
  # hierarchical constraints: the last pi groups (the interactions) each
  # require their main-effect group, i.e. groups 2..pi+1 (group 1 is the
  # intercept). Note: local `pi` shadows the base constant pi.
  p <- length(d$constraints); pi <- ncol(inter)
  d$constraints[(p - pi + 1):p] <- (1:pi) + 1
  ms_inter[[yname]] <- mombf:::modelSelection(
    d$y, d$x, includevars=1, family="binomial",
    priorDelta=modelbbprior(1,1), priorCoef=bicprior(), priorGroup=bicprior(),
    niter=NITER_YRBS, groups=d$groups, constraints=d$constraints
  )
  coef_inter[[yname]] <- coef(ms_inter[[yname]])
}
## loneliness
## Greedy searching posterior mode... Done.
## Running Gibbs sampler........... Done.
## think suicide
## Greedy searching posterior mode... Done.
## Running Gibbs sampler........... Done.
## plan suicide
## Greedy searching posterior mode... Done.
## Running Gibbs sampler........... Done.
## commit suicide
## Greedy searching posterior mode... Done.
## Running Gibbs sampler........... Done.
## doctor suicide
## Greedy searching posterior mode... Done.
## Running Gibbs sampler........... Done.
For none of the five outcomes can a gender effect different from zero be ruled out.
# Multi-outcome BMA SCA plot of the interaction coefficients, on the
# odds-ratio scale. The interaction rows are the last length(x_labels)
# coefficients of each fit.
supplement$mbsca_inter <- multi_bsca(
  coef_inter, ms=ms_inter, conversion=exp, y.scale='Odds ratio',
  treatments=tail(rownames(coef_inter$loneliness), n=length(x_labels))
)
supplement$mbsca_inter
